Source Code of org.terrier.indexing.BlockIndexer$BasicTermProcessor

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is BlockIndexer.java.
 *
 * The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 * Craig Macdonald <craigm{a.}dcs.gla.ac.uk> 
 * Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
 * Rodrygo Santo <rodrygo{a.}dcs.gla.ac.uk>
 */
package org.terrier.indexing;
import gnu.trove.THashSet;
import gnu.trove.TIntHashSet;


import java.io.IOException;
import java.util.Map;
import java.util.Set;


import org.terrier.compression.BitIn;
import org.terrier.structures.BasicDocumentIndexEntry;
import org.terrier.structures.BitIndexPointer;
import org.terrier.structures.BlockDirectInvertedOutputStream;
import org.terrier.structures.BlockFieldDirectInvertedOutputStream;
import org.terrier.structures.BlockLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.indexing.BlockDocumentPostingList;
import org.terrier.structures.indexing.BlockFieldDocumentPostingList;
import org.terrier.structures.indexing.BlockFieldLexiconMap;
import org.terrier.structures.indexing.BlockInvertedIndexBuilder;
import org.terrier.structures.indexing.BlockLexiconMap;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.DocumentPostingList;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.postings.BlockFieldIterablePosting;
import org.terrier.structures.postings.BlockIterablePosting;
import org.terrier.terms.TermPipeline;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.TermCodes;
/**
 * An indexer that saves block information for the indexed terms. Block information is usually recorded in terms of relative term positions (position 1, position 2, etc.),
 * however, since 2.2, Terrier supports the presence of "marker terms" during indexing which are used to increment the block counter.
 * <B>Properties:</b>
 * <ul> 
 * <li><tt>block.size</tt> - How many terms should be in one block. If you want to use phrasal search, this needs to be 1 (default).</li>
 * <li><tt>max.blocks</tt> - Maximum number of blocks in a document. After this number of blocks, all subsequent terms will be in the same block. Default 100,000</li>
 * <li><tt>block.indexing</tt> - This class should only be used if the <tt>block.indexing</tt> property is set.</li>
 * <li>indexing.max.encoded.documentindex.docs - how many docs before the DocumentIndexEncoded is dropped in favour of the DocumentIndex (on disk implementation).</li>
 * <li>string.use_utf - use the UTF index structures?</li>
 * <li><i>See Also: Properties in </i>org.terrier.indexing.Indexer <i>and</i> org.terrier.indexing.BasicIndexer</li>
 * </ul>
 * <p><b>Markered Blocks</b><br>Markers are terms (artificially inserted or otherwise) in the term stream that are used to denote when the block counter should
 * be incremented. This functionality is enabled using the <tt>block.delimiters.enabled</tt> property, while the terms are specified using a comma delimited fashion with the
 * <tt>block.delimiters</tt> property. The following lists the properties:
 * <ul>
 * <li><tt>block.delimiters.enabled</tt> - enables markered blocks. Defaults to false, set to true to enable.</li>
 * <li><tt>block.delimiters</tt> - comma delimited list of terms that are markers. Defaults to empty. Terms are lowercased if <tt>lowercase</tt> is set to true (default).</li>
 * <li><tt>block.delimiters.index.terms</tt> - set to true if marker terms should actually be indexed. Defaults to false.</li>
 * <li><tt>block.delimiters.index.doclength</tt> - set to true if marker terms should contribute to document length. Defaults to true, only has effect if 
 * <tt>block.delimiters.index.terms</tt> is set. </li>
 * </ul>
 * @author Craig Macdonald, Vassilis Plachouras, Rodrygo Santos
 */
public class BlockIndexer extends Indexer {
  
  /** This class implements an end of a TermPipeline that adds the
   *  term to the DocumentTree. This TermProcessor does NOT have field
   *  support.
   */   
  protected class BasicTermProcessor implements TermPipeline {
    public void processTerm(String t) {
      //  null means the term has been filtered out (eg stopwords)
      if (t != null) {
        //add term to thingy tree
        ((BlockDocumentPostingList)termsInDocument).insert(t, blockId);
        numOfTokensInDocument++;
        if (++numOfTokensInBlock >= BLOCK_SIZE && blockId < MAX_BLOCKS) {
          numOfTokensInBlock = 0;
          blockId++;
        }
      }
    }
    
    public boolean reset() {
      return true;
    }
  }
  /** 
   * This class implements an end of a TermPipeline that adds the
   * term to the DocumentTree. This TermProcessor does have field
   * support.
   */
  protected class FieldTermProcessor implements TermPipeline {
    final TIntHashSet fields = new TIntHashSet(numFields);
    final boolean ELSE_ENABLED = fieldNames.containsKey("ELSE");
    final int ELSE_FIELD_ID = fieldNames.get("ELSE") -1;
    public void processTerm(String t) {
      //  null means the term has been filtered out (eg stopwords)
      if (t != null) {
        //add term to document posting list
        for (String fieldName: termFields)
        {
          int tmp = fieldNames.get(fieldName);
          if (tmp > 0)
          {
            fields.add(tmp -1);
          }
        }
        if (ELSE_ENABLED && fields.size() == 0)
        {
          fields.add(ELSE_FIELD_ID);
        }
        ((BlockFieldDocumentPostingList)termsInDocument).insert(t,fields.toArray(), blockId);
        numOfTokensInDocument++;
        if (++numOfTokensInBlock >= BLOCK_SIZE && blockId < MAX_BLOCKS) {
          numOfTokensInBlock = 0;
          blockId++;
        }
        fields.clear();
      }
    }
    
    public boolean reset() {
      return true;
    }
  }


  /**
   * This class behaves in a similar fashion to BasicTermProcessor except that
   * this one treats blocks bounded by delimiters instead of fixed-sized blocks.
   * @author Rodrygo Santos
   * @since 2.2
   */
  protected class DelimTermProcessor implements TermPipeline {
    protected THashSet<String> blockDelimiterTerms;
    protected final boolean indexDelimiters;
    protected final boolean countDelimiters;
    
    public DelimTermProcessor(String[] _delims, boolean _indexDelimiters, boolean _countDelimiters) {
      blockDelimiterTerms = new THashSet<String>();
      for (String t : _delims)
        blockDelimiterTerms.add(t);
      indexDelimiters = _indexDelimiters;
      countDelimiters = _countDelimiters;
    }
    
    public void processTerm(String t) {
      if (t== null)
        return;
      // current term is a delimiter
      if (blockDelimiterTerms.contains(t)) {
        // delimiters should also be indexed
        if (indexDelimiters) {
            ((BlockDocumentPostingList)termsInDocument).insert(t, blockId);
            if (countDelimiters)
                numOfTokensInDocument++;
        }
        numOfTokensInBlock = 0;
        blockId++;
      }
      else {
        // index non-delimiter term
        ((BlockDocumentPostingList)termsInDocument).insert(t, blockId);
        numOfTokensInDocument++;
      }
    }
    
    public boolean reset() {
      return true;
    }
  }


  /**
   * This class behaves in a similar fashion to FieldTermProcessor except that
   * this one treats blocks bounded by delimiters instead of fixed-sized blocks.
   * @author Rodrygo Santos
   * @since 2.2
   */
  protected class DelimFieldTermProcessor implements TermPipeline {
    protected final THashSet<String> blockDelimiterTerms;
    protected final boolean indexDelimiters;
    protected final boolean countDelimiters;


    public DelimFieldTermProcessor(String[] _delims, boolean _indexDelimiters, boolean _countDelimiters) {
      blockDelimiterTerms = new THashSet<String>();
      for (String t : _delims)
        blockDelimiterTerms.add(t);
      indexDelimiters = _indexDelimiters;
      countDelimiters = _countDelimiters;
    }


    public void processTerm(String t) {
      if (t== null)
        return;
      // current term is a delimiter
      if (blockDelimiterTerms.contains(t)) {
        // delimiters should also be indexed
        if (indexDelimiters)
        {
          final int[] fieldIds = new int[numFields];
          int i=0;
          for (String fieldName: termFields)
          {
            fieldIds[i] = fieldNames.get(fieldName);
            i++;
          }
          ((BlockFieldDocumentPostingList)termsInDocument).insert(t, fieldIds, blockId);
          if (countDelimiters)
            numOfTokensInDocument++;
        }
        numOfTokensInBlock = 0;
        blockId++;
        }
        else {
        // index non-delimiter term
        final int[] fieldIds = new int[numFields];
        int i=0;
        for (String fieldName: termFields)
        {
          fieldIds[i] = fieldNames.get(fieldName);
          i++;
        }
        ((BlockFieldDocumentPostingList)termsInDocument).insert(t, fieldIds, blockId);
        numOfTokensInDocument++;
      }
    }
    
    public boolean reset() {
      return true;
    }
  }


  /** The number of tokens counted in the current document so far. */
  protected int numOfTokensInDocument = 0;
  /** The number of tokens counted in the current block of the current document. */
  protected int numOfTokensInBlock = 0;
  /** The id of the current block within the current document. */
  protected int blockId;
  /** The fields that are set for the current term. */
  protected Set<String> termFields = null;
  /** The list of terms in this document, and for each, the block occurrences. */
  protected DocumentPostingList termsInDocument = null;
  /** The maximum number of terms allowed in a block. See Property <tt>blocks.size</tt> */
  protected int BLOCK_SIZE;
  /** 
   * The maximum allowed number of blocks in a document. 
   * After this value, all the remaining terms are in the final block. See Property <tt>max.blocks</tt>. */
  protected int MAX_BLOCKS;


  /** Constructs an instance of this class, where the created data structures
    * are stored in the given path, with the given prefix on the filenames.
    * @param pathname String the path in which the created data structures will be saved. This is assumed to be
   * absolute.
    * @param prefix String the prefix on the filenames of the created data structures, usually "data"
    */
  public BlockIndexer(String pathname, String prefix) {
    super(pathname, prefix);
    if (this.getClass() == BlockIndexer.class)
      init();
    basicDirectIndexPostingIteratorClass = BlockIterablePosting.class.getName();
    fieldDirectIndexPostingIteratorClass = BlockFieldIterablePosting.class.getName();
  }


  /** 
   * Returns the object that is to be the end of the TermPipeline. 
   * This method is used at construction time of the parent object. 
   * @return TermPipeline the last component of the term pipeline.
   */
  protected TermPipeline getEndOfPipeline() {
    // if using delimited blocks
    if (Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"))) 
    {
      String delim = ApplicationSetup.getProperty("block.delimiters", "").trim();
      if (Boolean.parseBoolean(ApplicationSetup.getProperty("lowercase", "true")))
        delim = delim.toLowerCase();
      String delims[] = delim.split("\\s*,\\s*");
      final boolean indexDelims = Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.index.terms", "false"));
      final boolean countDelims = Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.index.doclength","true"));
      return (FieldScore.USE_FIELD_INFORMATION)
        ? new DelimFieldTermProcessor(delims, indexDelims, countDelims)
        : new DelimTermProcessor(delims, indexDelims, countDelims);
    }
    else if (FieldScore.USE_FIELD_INFORMATION) {
      return new FieldTermProcessor();
    }
    return new BasicTermProcessor();
  }
  
  


  /**
   * For the given collection, it iterates through the documents and
   * creates the direct index, document index and lexicon, using 
   * information about blocks and possibly fields.
   * <p>The terms of each document are passed through the term pipeline, which
   * accumulates them in <tt>termsInDocument</tt>; the document is then written
   * with <tt>indexDocument</tt>, or with <tt>indexEmpty</tt> when its posting
   * list reports a document length of zero. Indexing stops early once
   * <tt>MAX_DOCS_PER_BUILDER</tt> documents have been processed (when that
   * limit is positive), or after a document whose "docno" is listed in
   * <tt>BUILDER_BOUNDARY_DOCUMENTS</tt>.
   * @param collections Collection[] the collection to index.
   * @see org.terrier.indexing.Indexer#createDirectIndex(org.terrier.indexing.Collection[])
   */
  //TODO if this class extends BasicIndexer, then this method could be inherited
  public void createDirectIndex(Collection[] collections) {
  //  //logger.info("BlockIndexer creating direct index"+ 
  //    (Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
  //    ? " delimited-block indexing enabled" : ""));
    currentIndex = Index.createNewIndex(path, prefix);
    // the lexicon map and entry class depend on whether fields are being recorded
    lexiconBuilder = FieldScore.FIELDS_COUNT > 0
      ? new LexiconBuilder(currentIndex, "lexicon", new BlockFieldLexiconMap(FieldScore.FIELDS_COUNT), FieldLexiconEntry.class.getName())
      : new LexiconBuilder(currentIndex, "lexicon", new BlockLexiconMap(), BlockLexiconEntry.class.getName());
    //lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
    try{
      // the direct index is written to <path>/<prefix>.direct<extension>
      directIndexBuilder =  FieldScore.FIELDS_COUNT > 0
        ? new BlockFieldDirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION)
        : new BlockDirectInvertedOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + currentIndex.getPrefix() + "." + "direct" + BitIn.USUAL_EXTENSION);
    } catch (IOException ioe) {
      // NOTE(review): the failure is swallowed here, so directIndexBuilder is not
      // assigned by this method and later uses of it may fail - confirm intended handling
  //    logger.error("Cannot make DirectInvertedOutputStream:", ioe);
    }
    docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
    metaBuilder = createMetaIndexBuilder();
    emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT) : new BasicDocumentIndexEntry();
    
    //int LexiconCount = 0;
    int numberOfDocuments = 0;
    // NOTE(review): numberOfTokens is accumulated below but never read in this method
    int numberOfTokens = 0;
    //long startBunchOfDocuments = System.currentTimeMillis();
    final boolean boundaryDocsEnabled = BUILDER_BOUNDARY_DOCUMENTS.size() > 0;
    boolean stopIndexing = false;
    for(int collectionNo = 0; !stopIndexing && collectionNo < collections.length; collectionNo++)
    {
      Collection collection = collections[collectionNo];
      long startCollection = System.currentTimeMillis();
      // stays true when the loop below is left through a break, false once the collection is exhausted
      boolean notLastDoc = false;
      //while(notLastDoc = collection.hasNext()) {
      while ((notLastDoc = collection.nextDocument())) {
        //get the next document from the collection
        
        //String docid = collection.getDocid();
        //Document doc = collection.next();
        Document doc = collection.getDocument();
        
        // null documents are skipped and not counted
        if (doc == null)
          continue;
        
        numberOfDocuments++;
        //setup for parsing
        createDocumentPostings();
        String term;
        numOfTokensInDocument = 0;
        numOfTokensInBlock = 0;
        blockId = 0;
        //get each term in the document
        while (!doc.endOfDocument()) {
          if ((term = doc.getNextTerm()) != null && 
            !term.equals("")) {
            termFields = doc.getFields();
            //pass term into TermPipeline (stop, stem etc)
            pipeline_first.processTerm(term);
            //the term pipeline will eventually add the term to this
            // object.
          }
          // stop reading terms once the per-document token limit (if positive) is exceeded
          if (MAX_TOKENS_IN_DOCUMENT > 0 && 
            numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
            break;
        }
        //if we didn't index all tokens from document,
        //we need to get to the end of the document.
        while (!doc.endOfDocument()) 
          doc.getNextTerm();
        //we now have all terms in the DocumentTree
  
        pipeline_first.reset();
        //process DocumentTree (tree of terms)
        try
        {
          if (termsInDocument.getDocumentLength() == 0) { 
            //this document is empty, add the
            // minimum to the document index
            indexEmpty(doc.getAllProperties());
          } else { /* index this document */
            numberOfTokens += numOfTokensInDocument;
            indexDocument(doc.getAllProperties(), termsInDocument);
          }
        }
        catch (Exception ioe)
        {
          // NOTE(review): any failure to index this document is swallowed and
          // indexing continues with the next document
    //      logger.error("Failed to index "+doc.getProperty("docno"),ioe);
        }
        // stop once the configured number of documents for this builder has been reached
        if (MAX_DOCS_PER_BUILDER>0 && numberOfDocuments >= MAX_DOCS_PER_BUILDER)
        {
          stopIndexing = true;
          break;
        }


        // stop after a document whose docno is one of the configured boundary documents
        if (boundaryDocsEnabled && BUILDER_BOUNDARY_DOCUMENTS.contains(doc.getProperty("docno")))
        {
          stopIndexing = true;
          break;
        }
      }
      long endCollection = System.currentTimeMillis();
      // NOTE(review): secs is only used by the commented-out logging below
      long secs = ((endCollection-startCollection)/1000);
  //    //logger.info("Collection #"+collectionNo+ " took "+secs+"seconds to index "
  //      +"("+numberOfDocuments+" documents)\n");
  //    if (secs > 3600)
  //       //logger.info("Rate: "+((double)numberOfDocuments/((double)secs/3600.0d))+" docs/hour");


      // the collection is only closed when it was read to its end; if indexing
      // stopped early through a break, it is left open
      if (! notLastDoc)
      {
        try{
          collection.close();
        } catch (IOException e) {
          // failure to close the collection is ignored
  //        //logger.warn("Couldnt close collection", e);
        }
      }
    }


    /* end of the collection has been reached */
    finishedDirectIndexBuild();
    // register the direct index structure and its input stream, using the
    // posting iterator class that matches the field configuration
    currentIndex.addIndexStructure(
        "direct", 
        "org.terrier.structures.BlockDirectIndex", 
        "org.terrier.structures.Index,java.lang.String,java.lang.Class", 
        "index,structureName,"+ 
        (FieldScore.FIELDS_COUNT > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
    currentIndex.addIndexStructureInputStream(
        "direct",
        "org.terrier.structures.BlockDirectIndexInputStream", 
        "org.terrier.structures.Index,java.lang.String,java.lang.Class",
        "index,structureName,"+ 
          (FieldScore.FIELDS_COUNT > 0 ? fieldDirectIndexPostingIteratorClass : basicDirectIndexPostingIteratorClass));
    currentIndex.setIndexProperty("index.direct.fields.count", ""+FieldScore.FIELDS_COUNT );
    currentIndex.setIndexProperty("index.direct.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));
    // the document index entry factory also depends on whether fields are recorded
    if (FieldScore.FIELDS_COUNT > 0)
    {
      currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
    }
    else
    {
      currentIndex.addIndexStructure("document-factory", BasicDocumentIndexEntry.Factory.class.getName(), "", "");
    }
    /* flush the index buffers */
    directIndexBuilder.close();
    docIndexBuilder.finishedCollections();
    /* and then merge all the temporary lexicons */
    lexiconBuilder.finishedDirectIndexBuild();
    try {
      metaBuilder.close();
    } catch (IOException ioe) {
      // NOTE(review): failure to close the meta index builder is swallowed
  //    logger.error("Could not finish MetaIndexBuilder: ", ioe);
    }
    if (FieldScore.FIELDS_COUNT > 0)
    {
      currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.direct.fields.count}");
    }
    /* reset the in-memory mapping of terms to term codes.*/
    TermCodes.reset();
    System.gc();
    try {
      currentIndex.flush();
    } catch (IOException ioe) {
      // NOTE(review): failure to flush the index is swallowed
  //    logger.error("Could not flush index properties: ", ioe);
    }
  }


  /** 
   * This adds a document to the direct and document indexes, as well 
   * as it's terms to the lexicon. Handled internally by the methods 
   * indexFieldDocument and indexNoFieldDocument.
   * @param docProperties Map<String,String> properties of the document
   * @param _termsInDocument DocumentPostingList the terms in the document.
   * 
   */
  protected void indexDocument(Map<String,String> docProperties, DocumentPostingList _termsInDocument) throws Exception
  {
    /* add words to lexicontree */
    lexiconBuilder.addDocumentTerms(_termsInDocument);
    /* add doc postings to the direct index */
    BitIndexPointer dirIndexPost = directIndexBuilder.writePostings(_termsInDocument.getPostings2());
    /* add doc to documentindex */
    DocumentIndexEntry die = _termsInDocument.getDocumentStatistics();
    die.setBitIndexPointer(dirIndexPost);
    docIndexBuilder.addEntryToBuffer(die);
    /** add doc metadata to index */
    metaBuilder.writeDocumentEntry(docProperties);
  }
  
  /**
   * Creates the inverted index from the already created direct index,
   * document index and lexicon. It saves block information and possibly
   * field information as well.
   * @see org.terrier.indexing.Indexer#createInvertedIndex()
   */
  public void createInvertedIndex() {
    if (currentIndex == null)
    {
      currentIndex = Index.createIndex(path,prefix);
      if (currentIndex == null)
      {
  //      logger.error("No index at ("+path+","+prefix+") to build an inverted index for ");
      }
    }
    long beginTimestamp = System.currentTimeMillis();


    if (currentIndex.getCollectionStatistics().getNumberOfUniqueTerms() == 0)
    {
  //    logger.error("Index has no terms. Inverted index creation aborted.");
      return;
    }
    if (currentIndex.getCollectionStatistics().getNumberOfDocuments() == 0)
    {
  //    logger.error("Index has no documents. Inverted index creation aborted.");
      return;
    }


  //  //logger.info("Started building the block inverted index...");
    invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex, "inverted");
    invertedIndexBuilder.createInvertedIndex();
    this.finishedInvertedIndexBuild();
    try{
      currentIndex.flush();
    } catch (IOException ioe) {
  //    logger.error("Cannot flush index: ", ioe);
    }


    long endTimestamp = System.currentTimeMillis();
  //  //logger.info("Finished building the block inverted index...");
    long seconds = (endTimestamp - beginTimestamp) / 1000;
  //  //logger.info("Time elapsed for inverted file: " + seconds);
  }




  /**
   * Hook method, called when the inverted index is finished - ie the lexicon is finished.
   * This implementation runs <tt>LexiconBuilder.optimise</tt> on the "lexicon"
   * structure of the current index.
   */
  protected void finishedInvertedIndexBuild()
  {
    LexiconBuilder.optimise(currentIndex, "lexicon");
  }


  
  protected void createDocumentPostings(){
    if (FieldScore.FIELDS_COUNT > 0)
      termsInDocument = new BlockFieldDocumentPostingList(FieldScore.FIELDS_COUNT);
    else
      termsInDocument = new BlockDocumentPostingList();
    blockId = 0;
    numOfTokensInBlock = 0;  
  }


  /**
   * Loads the properties of the parent indexer, then copies the block size
   * and the maximum number of blocks from <tt>ApplicationSetup</tt>.
   */
  @Override
  protected void load_indexer_properties() {
    super.load_indexer_properties();
    BLOCK_SIZE = ApplicationSetup.BLOCK_SIZE;
    MAX_BLOCKS = ApplicationSetup.MAX_BLOCKS;
  }
}
Source Code of org.terrier.indexing.BlockIndexer$BasicTermProcessor

Related Classes of org.terrier.indexing.BlockIndexer$BasicTermProcessor